{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem.snowball import SnowballStemmer\n",
    "\n",
    "from sklearn.datasets import fetch_20newsgroups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>From: lerxst@wam.umd.edu (where's my thing)\\nS...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>From: guykuo@carson.u.washington.edu (Guy Kuo)...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>From: twillis@ec.ecn.purdue.edu (Thomas E Will...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>From: jgreen@amber (Joe Green)\\nSubject: Re: W...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>From: jcm@head-cfa.harvard.edu (Jonathan McDow...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text\n",
       "0  From: lerxst@wam.umd.edu (where's my thing)\\nS...\n",
       "1  From: guykuo@carson.u.washington.edu (Guy Kuo)...\n",
       "2  From: twillis@ec.ecn.purdue.edu (Thomas E Will...\n",
       "3  From: jgreen@amber (Joe Green)\\nSubject: Re: W...\n",
       "4  From: jcm@head-cfa.harvard.edu (Jonathan McDow..."
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load data\n",
    "data = fetch_20newsgroups(subset='train')\n",
    "df = pd.DataFrame(data.data, columns=['text'])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "From: irwin@cmptrc.lonestar.org (Irwin Arnstein)\n",
      "Subject: Re: Recommendation on Duc\n",
      "Summary: What's it worth?\n",
      "Distribution: usa\n",
      "Expires: Sat, 1 May 1993 05:00:00 GMT\n",
      "Organization: CompuTrac Inc., Richardson TX\n",
      "Keywords: Ducati, GTS, How much? \n",
      "Lines: 13\n",
      "\n",
      "I have a line on a Ducati 900GTS 1978 model with 17k on the clock.  Runs\n",
      "very well, paint is the bronze/brown/orange faded out, leaks a bit of oil\n",
      "and pops out of 1st with hard accel.  The shop will fix trans and oil \n",
      "leak.  They sold the bike to the 1 and only owner.  They want $3495, and\n",
      "I am thinking more like $3K.  Any opinions out there?  Please email me.\n",
      "Thanks.  It would be a nice stable mate to the Beemer.  Then I'll get\n",
      "a jap bike and call myself Axis Motors!\n",
      "\n",
      "-- \n",
      "-----------------------------------------------------------------------\n",
      "\"Tuba\" (Irwin)      \"I honk therefore I am\"     CompuTrac-Richardson,Tx\n",
      "irwin@cmptrc.lonestar.org    DoD #0826          (R75/6)\n",
      "-----------------------------------------------------------------------\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# print example of text\n",
    "\n",
    "print(df['text'][10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove punctuation\n",
    "df[\"text\"] = df['text'].str.replace('[^\\w\\s]','', regex=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "From irwincmptrclonestarorg Irwin Arnstein\n",
      "Subject Re Recommendation on Duc\n",
      "Summary Whats it worth\n",
      "Distribution usa\n",
      "Expires Sat 1 May 1993 050000 GMT\n",
      "Organization CompuTrac Inc Richardson TX\n",
      "Keywords Ducati GTS How much \n",
      "Lines 13\n",
      "\n",
      "I have a line on a Ducati 900GTS 1978 model with 17k on the clock  Runs\n",
      "very well paint is the bronzebrownorange faded out leaks a bit of oil\n",
      "and pops out of 1st with hard accel  The shop will fix trans and oil \n",
      "leak  They sold the bike to the 1 and only owner  They want 3495 and\n",
      "I am thinking more like 3K  Any opinions out there  Please email me\n",
      "Thanks  It would be a nice stable mate to the Beemer  Then Ill get\n",
      "a jap bike and call myself Axis Motors\n",
      "\n",
      " \n",
      "\n",
      "Tuba Irwin      I honk therefore I am     CompuTracRichardsonTx\n",
      "irwincmptrclonestarorg    DoD 0826          R756\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# print example without punctuation\n",
    "\n",
    "print(df['text'][10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# alternative way to remove punctuation\n",
    "import string\n",
    "\n",
    "df['text'] = df['text'].str.replace('[{}]'.format(string.punctuation), '', regex=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the punctuation that will be removed\n",
    "\n",
    "string.punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove numbers, keep only text\n",
    "\n",
    "df['text'] = df['text'].str.replace('\\d+', '', regex=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "From irwincmptrclonestarorg Irwin Arnstein\n",
      "Subject Re Recommendation on Duc\n",
      "Summary Whats it worth\n",
      "Distribution usa\n",
      "Expires Sat  May   GMT\n",
      "Organization CompuTrac Inc Richardson TX\n",
      "Keywords Ducati GTS How much \n",
      "Lines \n",
      "\n",
      "I have a line on a Ducati GTS  model with k on the clock  Runs\n",
      "very well paint is the bronzebrownorange faded out leaks a bit of oil\n",
      "and pops out of st with hard accel  The shop will fix trans and oil \n",
      "leak  They sold the bike to the  and only owner  They want  and\n",
      "I am thinking more like K  Any opinions out there  Please email me\n",
      "Thanks  It would be a nice stable mate to the Beemer  Then Ill get\n",
      "a jap bike and call myself Axis Motors\n",
      "\n",
      " \n",
      "\n",
      "Tuba Irwin      I honk therefore I am     CompuTracRichardsonTx\n",
      "irwincmptrclonestarorg    DoD           R\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# print example without numbers\n",
    "\n",
    "print(df['text'][10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# put in lower case\n",
    "\n",
    "df['text'] = df['text'].str.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "from irwincmptrclonestarorg irwin arnstein\n",
      "subject re recommendation on duc\n",
      "summary whats it worth\n",
      "distribution usa\n",
      "expires sat  may   gmt\n",
      "organization computrac inc richardson tx\n",
      "keywords ducati gts how much \n",
      "lines \n",
      "\n",
      "i have a line on a ducati gts  model with k on the clock  runs\n",
      "very well paint is the bronzebrownorange faded out leaks a bit of oil\n",
      "and pops out of st with hard accel  the shop will fix trans and oil \n",
      "leak  they sold the bike to the  and only owner  they want  and\n",
      "i am thinking more like k  any opinions out there  please email me\n",
      "thanks  it would be a nice stable mate to the beemer  then ill get\n",
      "a jap bike and call myself axis motors\n",
      "\n",
      " \n",
      "\n",
      "tuba irwin      i honk therefore i am     computracrichardsontx\n",
      "irwincmptrclonestarorg    dod           r\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# print example in lower case\n",
    "\n",
    "print(df['text'][10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['i',\n",
       " 'me',\n",
       " 'my',\n",
       " 'myself',\n",
       " 'we',\n",
       " 'our',\n",
       " 'ours',\n",
       " 'ourselves',\n",
       " 'you',\n",
       " \"you're\",\n",
       " \"you've\",\n",
       " \"you'll\",\n",
       " \"you'd\",\n",
       " 'your',\n",
       " 'yours',\n",
       " 'yourself',\n",
       " 'yourselves',\n",
       " 'he',\n",
       " 'him',\n",
       " 'his',\n",
       " 'himself',\n",
       " 'she',\n",
       " \"she's\",\n",
       " 'her',\n",
       " 'hers',\n",
       " 'herself',\n",
       " 'it',\n",
       " \"it's\",\n",
       " 'its',\n",
       " 'itself',\n",
       " 'they',\n",
       " 'them',\n",
       " 'their',\n",
       " 'theirs',\n",
       " 'themselves',\n",
       " 'what',\n",
       " 'which',\n",
       " 'who',\n",
       " 'whom',\n",
       " 'this',\n",
       " 'that',\n",
       " \"that'll\",\n",
       " 'these',\n",
       " 'those',\n",
       " 'am',\n",
       " 'is',\n",
       " 'are',\n",
       " 'was',\n",
       " 'were',\n",
       " 'be',\n",
       " 'been',\n",
       " 'being',\n",
       " 'have',\n",
       " 'has',\n",
       " 'had',\n",
       " 'having',\n",
       " 'do',\n",
       " 'does',\n",
       " 'did',\n",
       " 'doing',\n",
       " 'a',\n",
       " 'an',\n",
       " 'the',\n",
       " 'and',\n",
       " 'but',\n",
       " 'if',\n",
       " 'or',\n",
       " 'because',\n",
       " 'as',\n",
       " 'until',\n",
       " 'while',\n",
       " 'of',\n",
       " 'at',\n",
       " 'by',\n",
       " 'for',\n",
       " 'with',\n",
       " 'about',\n",
       " 'against',\n",
       " 'between',\n",
       " 'into',\n",
       " 'through',\n",
       " 'during',\n",
       " 'before',\n",
       " 'after',\n",
       " 'above',\n",
       " 'below',\n",
       " 'to',\n",
       " 'from',\n",
       " 'up',\n",
       " 'down',\n",
       " 'in',\n",
       " 'out',\n",
       " 'on',\n",
       " 'off',\n",
       " 'over',\n",
       " 'under',\n",
       " 'again',\n",
       " 'further',\n",
       " 'then',\n",
       " 'once',\n",
       " 'here',\n",
       " 'there',\n",
       " 'when',\n",
       " 'where',\n",
       " 'why',\n",
       " 'how',\n",
       " 'all',\n",
       " 'any',\n",
       " 'both',\n",
       " 'each',\n",
       " 'few',\n",
       " 'more',\n",
       " 'most',\n",
       " 'other',\n",
       " 'some',\n",
       " 'such',\n",
       " 'no',\n",
       " 'nor',\n",
       " 'not',\n",
       " 'only',\n",
       " 'own',\n",
       " 'same',\n",
       " 'so',\n",
       " 'than',\n",
       " 'too',\n",
       " 'very',\n",
       " 's',\n",
       " 't',\n",
       " 'can',\n",
       " 'will',\n",
       " 'just',\n",
       " 'don',\n",
       " \"don't\",\n",
       " 'should',\n",
       " \"should've\",\n",
       " 'now',\n",
       " 'd',\n",
       " 'll',\n",
       " 'm',\n",
       " 'o',\n",
       " 're',\n",
       " 've',\n",
       " 'y',\n",
       " 'ain',\n",
       " 'aren',\n",
       " \"aren't\",\n",
       " 'couldn',\n",
       " \"couldn't\",\n",
       " 'didn',\n",
       " \"didn't\",\n",
       " 'doesn',\n",
       " \"doesn't\",\n",
       " 'hadn',\n",
       " \"hadn't\",\n",
       " 'hasn',\n",
       " \"hasn't\",\n",
       " 'haven',\n",
       " \"haven't\",\n",
       " 'isn',\n",
       " \"isn't\",\n",
       " 'ma',\n",
       " 'mightn',\n",
       " \"mightn't\",\n",
       " 'mustn',\n",
       " \"mustn't\",\n",
       " 'needn',\n",
       " \"needn't\",\n",
       " 'shan',\n",
       " \"shan't\",\n",
       " 'shouldn',\n",
       " \"shouldn't\",\n",
       " 'wasn',\n",
       " \"wasn't\",\n",
       " 'weren',\n",
       " \"weren't\",\n",
       " 'won',\n",
       " \"won't\",\n",
       " 'wouldn',\n",
       " \"wouldn't\"]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stopwords.words('english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# temove stop words\n",
    "\n",
    "def remove_stopwords(text):\n",
    "    stop = set(stopwords.words('english'))\n",
    "    text = [word for word in text.split() if word not in stop]\n",
    "    text = ' '.join(x for x in text)\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'irwincmptrclonestarorg irwin arnstein subject recommendation duc summary whats worth distribution usa expires sat may gmt organization computrac inc richardson tx keywords ducati gts much lines line ducati gts model k clock runs well paint bronzebrownorange faded leaks bit oil pops st hard accel shop fix trans oil leak sold bike owner want thinking like k opinions please email thanks would nice stable mate beemer ill get jap bike call axis motors tuba irwin honk therefore computracrichardsontx irwincmptrclonestarorg dod r'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# test function on single text\n",
    "\n",
    "remove_stopwords(df['text'][10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# apply function to entire dataframe\n",
    "# (this operation takes a while)\n",
    "\n",
    "df['text'] = df['text'].apply(remove_stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "irwincmptrclonestarorg irwin arnstein subject recommendation duc summary whats worth distribution usa expires sat may gmt organization computrac inc richardson tx keywords ducati gts much lines line ducati gts model k clock runs well paint bronzebrownorange faded leaks bit oil pops st hard accel shop fix trans oil leak sold bike owner want thinking like k opinions please email thanks would nice stable mate beemer ill get jap bike call axis motors tuba irwin honk therefore computracrichardsontx irwincmptrclonestarorg dod r\n"
     ]
    }
   ],
   "source": [
    "# print example text without stopwords\n",
    "\n",
    "print(df['text'][10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stemming\n",
    "\n",
    "# http://www.nltk.org/howto/stem.html\n",
    "# for other stemmers\n",
    "\n",
    "stemmer = SnowballStemmer(\"english\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'run'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# test stemmer in one word\n",
    "stemmer.stem('running')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def stemm_words(text):\n",
    "    text = [stemmer.stem(word) for word in text.split()]\n",
    "    text = ' '.join(x for x in text)\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'irwincmptrclonestarorg irwin arnstein subject recommend duc summari what worth distribut usa expir sat may gmt organ computrac inc richardson tx keyword ducati gts much line line ducati gts model k clock run well paint bronzebrownorang fade leak bit oil pop st hard accel shop fix tran oil leak sold bike owner want think like k opinion pleas email thank would nice stabl mate beemer ill get jap bike call axi motor tuba irwin honk therefor computracrichardsontx irwincmptrclonestarorg dod r'"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# test function on single text\n",
    "\n",
    "stemm_words(df['text'][10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# stem entire dataframe\n",
    "df['text'] = df['text'].apply(stemm_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "irwincmptrclonestarorg irwin arnstein subject recommend duc summari what worth distribut usa expir sat may gmt organ computrac inc richardson tx keyword ducati gts much line line ducati gts model k clock run well paint bronzebrownorang fade leak bit oil pop st hard accel shop fix tran oil leak sold bike owner want think like k opinion pleas email thank would nice stabl mate beemer ill get jap bike call axi motor tuba irwin honk therefor computracrichardsontx irwincmptrclonestarorg dod r\n"
     ]
    }
   ],
   "source": [
    "# print example with stemmed words\n",
    "\n",
    "print(df['text'][10])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "fsml",
   "language": "python",
   "name": "fsml"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
